package server

import (
	"fmt"
	"github.com/PuerkitoBio/goquery"
	"github.com/go-resty/resty/v2"
	"github.com/paleblueyk/logger"
	"gitee.com/Yazzyk/zu-fang/config"
	"gitee.com/Yazzyk/zu-fang/model"
	_const "gitee.com/Yazzyk/zu-fang/pkg/const"
	"io/ioutil"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"
)

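// NOTE: with a keyword applied the crawl only produced 20-odd results, none of
// them useful, so the keyword filtering was not tuned any further.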

// DoubanSpider 豆瓣爬虫
func DoubanSpider() {
	var (
		page     uint
		talkList []model.Talk
	)
	for {
		start := page * 30
		resp, err := resty.New().R().SetCookie(&http.Cookie{
			Name:   "dbcl2",
			Value:  config.App.Douban.Dbcl2,
			Path:   "/",
			Domain: ".douban.com",
			MaxAge: 60 * 60 * 60 * 24 * 30,
		}).Get(fmt.Sprintf("%s/group/%d/discussion?start=%d&type=new", _const.DouBanUrl, config.App.Douban.TopicID, start))
		if err != nil {
			logger.Error(err)
			return
		}
		//ioutil.WriteFile("./index.html", resp.Body(), os.ModePerm)
		pageTalkList := handlerDouBanInfo(resp)
		//去重
		if page == 0 {
			talkList = pageTalkList
			goto result
		}
		talkList = append(talkList, pageTalkList...)
		if len(pageTalkList) < 30 {
			break
		}
	result:
		page += 1
	}

	result := handlerDouBanFilter(talkList, "")// 传入关键词
	var out string
	out += "标题,链接,作者,评论,最后评论时间"
	for _, talk := range result {
		out += "\n"
		out += fmt.Sprintf("%s,%s,%s,%d条,%s", talk.Title, talk.Link, talk.Author, talk.Comment, talk.LastCommentTime.String())
	}
	ioutil.WriteFile("./info.csv", []byte(out), os.ModePerm)
}

// handlerDouBanInfo extracts the topics listed on one Douban group discussion
// page.
//
// Every row matched by "#wrapper #content table tr" except the first (the
// table header) becomes one model.Talk: cell 0 supplies the title and link,
// cell 1 the author, cell 2 the reply count and cell 3 the time of the last
// reply. Newlines, tabs and spaces are removed from each cell's text before it
// is used. Values that cannot be parsed are logged and left at their zero
// value. It returns nil when the response body cannot be parsed as HTML.
func handlerDouBanInfo(resp *resty.Response) []model.Talk {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(resp.String()))
	if err != nil {
		logger.Error(err)
		return nil
	}

	// Single-pass removal of the whitespace the markup puts around cell text.
	stripSpace := strings.NewReplacer("\n", "", "\t", "", " ", "")
	now := time.Now()

	var talkList []model.Talk
	doc.Find("#wrapper #content table tr").Each(func(row int, line *goquery.Selection) {
		// Skip the table header.
		if row == 0 {
			return
		}
		var talk model.Talk
		line.Find("td").Each(func(col int, item *goquery.Selection) {
			text := stripSpace.Replace(item.Text())
			switch col {
			case 0: // title / discussion
				talk.Title = text
				// A cell without an anchor simply leaves Link empty.
				talk.Link, _ = item.Find("a").Attr("href")
			case 1: // author
				talk.Author = text
			case 2: // reply count; an empty cell means no replies yet
				if text == "" {
					return
				}
				count, err := strconv.Atoi(text)
				if err != nil {
					logger.Error(fmt.Errorf("parsing reply count %q: %w", text, err))
				}
				talk.Comment = count
			case 3: // time of the last reply, shown as "01-02 15:04"
				t, err := parseDoubanCommentTime(text, now)
				if err != nil {
					logger.Error(fmt.Errorf("parsing last reply time %q: %w", text, err))
				}
				// Always set, even on failure (zero time), so readers of the
				// field never meet a nil pointer for a row that had this cell.
				talk.LastCommentTime = &t
			}
		})
		talkList = append(talkList, talk)
	})
	return talkList
}

// parseDoubanCommentTime converts the whitespace-stripped text of the
// "last reply" cell into a time.
//
// The cell carries no year, so the current year of now is tried first. When
// that date does not exist (02-29) or lies more than a day after now, the
// previous year is used instead; this keeps late-December replies viewed in
// January from being stamped almost a year in the future. The one-day margin
// absorbs the difference between the page's wall-clock time, which is parsed
// as UTC, and the local clock. If neither year parses, the zero time and the
// first parse error are returned.
func parseDoubanCommentTime(raw string, now time.Time) (time.Time, error) {
	const monthDayLayout = "2006-01-0215:04"

	var (
		fallback     time.Time
		haveFallback bool
		firstErr     error
	)
	latest := now.Add(24 * time.Hour)
	for _, year := range []int{now.Year(), now.Year() - 1} {
		t, err := time.Parse(monthDayLayout, fmt.Sprintf("%d-%s", year, raw))
		if err != nil {
			if firstErr == nil {
				firstErr = err
			}
			continue
		}
		if !t.After(latest) {
			return t, nil
		}
		if !haveFallback {
			fallback, haveFallback = t, true
		}
	}
	if haveFallback {
		return fallback, nil
	}

	// NOTE(review): presumably threads from earlier years show a full
	// "2006-01-02" date instead of month-day and time — confirm against the page.
	if t, err := time.Parse("2006-01-02", raw); err == nil {
		return t, nil
	}
	return time.Time{}, firstErr
}

// handlerDouBanFilter narrows the crawled topics down to rental offers.
//
// Topics whose title contains "求" or "找" are treated as requests rather than
// offers; they are logged and dropped. Of the remaining topics, those whose
// title contains at least one of the given keywords are returned in input
// order, at most once per link. With no keyword at all nothing is returned;
// an empty keyword is contained in every title and therefore keeps them all.
func handlerDouBanFilter(list []model.Talk, keyword ...string) []model.Talk {
	// containsAny reports whether title contains at least one of needles.
	containsAny := func(title string, needles ...string) bool {
		for _, needle := range needles {
			if strings.Contains(title, needle) {
				return true
			}
		}
		return false
	}

	var kept []model.Talk
	seenLinks := make(map[string]struct{})
	for i := range list {
		entry := list[i]
		if containsAny(entry.Title, "求", "找") {
			logger.Info("去掉疑似求助信息: ", entry.Title)
			continue
		}
		if !containsAny(entry.Title, keyword...) {
			continue
		}
		// Keep only the first topic seen for each link.
		if _, dup := seenLinks[entry.Link]; dup {
			continue
		}
		seenLinks[entry.Link] = struct{}{}
		kept = append(kept, entry)
	}
	return kept
}
